###############################################################################
#
# ptsecstruct.py - object to represent secondary structure elements (SSEs) and
#            their associated Hydrogen bonds and functions to parse
#            these from external programs (DSSP, STRIDE, etc.)
#
# File:    ptsecstruct.py
# Author:  Alex Stivala
# Created: July 2007
#
# $Id: ptsecstruct.py 2 2011-10-13 04:20:13Z stivala2003@yahoo.com.au $
#
# PTSecStruct is a class representing secondary structures (helices, sheets)
# and their associated hydrogen bonds.
#
# This module contains a class to read protein secondary structure
# information generated by:
#
# STRIDE
# (see http://webclu.bio.wzw.tum.de/stride/
# download source code from ftp://ftp.ebi.ac.uk/pub/software/unix/stride/)
# See
# Frishman and Argos 1995 "Knowledge-based secondary structure assignment"
# Proteins: structure function and genetics 23:566-579.
#
# DSSP
# see http://swift.cmbi.ru.nl/gv/dssp/
# Note that a licence (free for academic use) is required.
# See also
# Kabsch and Sander 1983 "Dictionary of Protein Secondary Structure:
# Pattern Recognition of Hydrogen-Bonded and Geometrical Features"
# Biopolymers 22:2577-2637).
#
###############################################################################

import os,sys
from time import strftime,localtime

from ptutils import get_int_icode,pdb_res_seq_cmp


#-----------------------------------------------------------------------------
#
# Module globals
#
#-----------------------------------------------------------------------------

# constants


DSSP_HBOND_ENERGY_THRESHOLD = -0.5  # (kcal/mol)
                                    # DSSP hbonds must have energy <= this
                                    # to be considered.

# global variables

verbose = False

#-----------------------------------------------------------------------------
#
# Class definitions 
#
#-----------------------------------------------------------------------------

class PTSecStruct:
    """
    The PTSecStruct class represents secondary structure 
    
    Just access the helix_list and strand_list directly and pdb_id
    and other member data directly.

    Member data (set and ready to read after calling this routine):
        helix_list - list of helix tuples in form:
                     (start_chainid, start_resnum, end_chainid, end_resnum,
                     type)
                     where type is the DSSP code ('H', 'G' or 'I').
                     NB start_resnum and and end_resnum are PDB residue numbers
                     and are strings, not integers. They may contain
                     an insertion code, so e.g. may have residue numbers
                     '60','60A','60B','61' which are 4 distinct and sequtnially
                     adjacnet residues. Also may have gapes e.g. '58','70'
                     adjacent in sequence.
        strand_list - list of strand tuples, as per helix_list, but without
                       helix type
        pdb_id      - PDB identifier
        pdb_header  - PDB header string (description)
        hbond_list  - list of hydrogen bond tuples of the form:
                      (chainid1, resnum1, chainid2, resnum2, dist_N_O)
        bridgeres_list - list of residues in bridges in tuples of form:
                        (chainid1, resnum1, chainid2, resnum2, bdir)
                         bdir is P for parallel or N for antiparallel.
                       Note for hbond_list and brideres_list: resnums
                       are PDB residue numbers as strings, may have insertino
                       codes.
    """
    def __init__(self):
        """
        Initialize empty PTSecStruct object.
        See class documentation for more information.
        """
        self.helix_list = []     # list of tuples, see comments above
        self.strand_list = []    # as above
        self.hbond_list = []     # list of tuples, see comments above
        self.bridgeres_list = [] # list of bridge residue tuples (see above)

    
    def __str__(self):
        """
        Return string representation of secondary structure as, one per line,
        sse type (H or E like DSSP) followed by start and end residue
        numbers then chain id. Ordered by chain then residue number.

        NOTE H bonds and bridges not included in string representation.
        """
        tuple_list = self.get_sse_tuple_list()
        s = ""
        for sse_tuple in tuple_list:
            s += sse_tuple[3]  + "\t" + str(sse_tuple[1]) +\
                 "\t" + str(sse_tuple[2]) + \
                 "\t" + sse_tuple[0] + "\n"
        return s

    
    def get_sse_tuple_list(self):
        """
        build sse tuple list of (chain, start_res, end_res, type) and sort
        type is 'H' or 'E'.
        Parameters: None
        Return value: list of (chain,strartres,end_res,type)
        """
        # build sse tuple list of (chain, start_res, end_res, type) and sort
        tuple_list = [ (chain, start_res, end_res, 'H')  for  \
                       (chain, start_res, end_chainid, end_res, htype) in \
                       self.helix_list ]
        tuple_list += [ (chain, start_res, end_res, 'E')  for  \
                       (chain, start_res, end_chainid, end_res) in \
                       self.strand_list ]
        tuple_list.sort(cmp=tuplecmp)
        return tuple_list
    

    def get_num_sses(self):
        """
        Return the total number of SSEs, which is the number of helices
        plus number of strands.

        Parameters: None
        Uses data members (readonly):
           helix_list, strand_list
        Return value: number of helices + number of strands
        """
        return len(self.helix_list) + len(self.strand_list)


    def check_validity_and_fix(self):
        """
        Check for overlapping secondary structures. This happens for
        example in the PDB HELIX records for 1DLC.  In such a case we
        recover from it in for example this case
        by adding or subtracting one to start/end of ovlerlapping
        HELIX records,

        Parameters:
          None
        Return value:
          True if OK, False if invalid (overlapping structures)
          (Now returns True if it has fixed up overlaps itself)
        Uses data members (READ/WRITE):
           helix_list, strand_list
            (start and end in helix and strand tuples may be modified;
            lists are sorted by increasing residue sequence number)
        """
        helices = [ (chain, start, end, endchain, 'H', htype) 
                    for (chain, start, endchain, end, htype)
                    in self.helix_list ]
        strands = [ (chain, start, end, endchain, 'E', None)
                    for (chain, start, endchain, end)
                    in self.strand_list ]
        sselist = helices + strands
        sselist.sort(cmp=tuplecmp)
        is_valid = True
        for i in xrange(1, len(sselist)):
            sse = sselist[i]
            prevsse = sselist[i-1]
            if (prevsse[0] == sse[0] and
                pdb_res_seq_cmp(sse[1], prevsse[2]) <= 0):
                sys.stderr.write('WARNING: PDB has overlapping SSE definitions'
                                 ' ' + str(prevsse) + ' and ' + str(sse) + ': ')
                # remove overlap by shortening longer one and lengthing
                # shorter one
                # FIXME: this is ignoring insertion codes etc., really
                # should convert to proper sequential residue sequence numbers
                # to do this
                (prevsse_start,prevsse_start_icode) = get_int_icode(prevsse[1])
                (prevsse_end,prevsse_end_icode) = get_int_icode(prevsse[2])
                (sse_start,sse_start_icode) = get_int_icode(sse[1])
                (sse_end,sse_end_icode) = get_int_icode(sse[2])
                if (prevsse_end_icode or sse_start_icode):
                    sys.stderr.write('contains insertion codes, giving up\n')
                    is_valid = False
                    continue
                prevsse_len = prevsse_end - prevsse_start + 1
                sse_len = sse_end - sse_start + 1
                overlap = prevsse_end - sse_start + 1
                if sse_len > prevsse_len:
                    sse_start += overlap
                else:
                    prevsse_end -= overlap
                sselist[i] = (sse[0],str(sse_start),str(sse_end),
                               sse[3],sse[4],sse[5])
                sselist[i-1] = (prevsse[0],str(prevsse_start),str(prevsse_end),
                                 prevsse[3],prevsse[4],prevsse[5])
                sys.stderr.write('changed to ' + str(sselist[i-1]) + ' and ' +
                                 str(sselist[i]) + '\n')
            i += 1

        # rebuild the helix_list and strand_list with our modified tuples
        self.helix_list = [ (chain, start, endchain, end, htype)
                            for (chain, start, end, endchain, ssetype, htype)
                            in sselist if ssetype == 'H' ]
        self.strand_list = [ (chain, start, endchain, end) 
                             for (chain, start, end, endchain, ssetype, htype)
                             in sselist if ssetype == 'E' ]
        return is_valid
    

    def write_pymol_sse_commands(self, fh, pdbfilename):
        """
        Write PyMOL command file (use @filename in PyMOL) to load the PDB
        and define SSEs according to the definition we used here (PDB cards
        or STRIDE or DSSP) rather than PyMOL's own (DSS or PDB) definition.
    
        Parameters:
           fh - open (write) filehandle to write commands to
           pdbfilename -filename of the PDB file to load
        Return value:
           None
        """
        write_pymol_prelude(fh)
        fh.write('# ' + self.pdb_id + '\n')
        fh.write('# ' + self.pdb_header + '\n')
        fh.write('#\n')
        write_pymol_load(fh, self.pdb_id, pdbfilename)
        write_pml_define_sses(fh, self.pdb_id, self.get_sse_tuple_list())
        write_pymol_conclusion(fh)


    
#-----------------------------------------------------------------------------
#
# Function definitions
#
#-----------------------------------------------------------------------------
                        

def read_secstruct_from_stride(pdb_filename, pdb_secstruct = None):
    """
    Build and return an instance of the PTSecStruct class
    which represents the secondary structure as assigned
    by the STRIDE program
    (Frishman and Argos 1995 "Knowledge-based secondary structure assignment"
    Proteins: structure function and genetics 23:566-579).

    Parameters:
        pdb_filename - filename of PDB file to run STRIDE against
        pdb_secstruct (read/write) default None
                       - a previously built PTSecStruct
                        (eg from PDB HELIX and SHEET cards).
                        If this is not None, only hydrogen bond information
                        is read, and added to this PTSecstruct.

    Return value:
        PTSecStruct class representing secondary structure and H bonds
        ass assigned by STRIDE (see following comments).
        
    The class is instantiated by supplying a file handle of an open
    PDB file (for reading)
    to the constructor. STRIDE is run with that PDB file as input and
    the secondary structure assignments are parsed
    from the STRIDE output.

    If the H-bond information is to be read then stride must have been
    invoked with the -h option to generate the DNR and ACC records.
    As described in the STRIDE documentation (stride.doc in the stride
    source code distribution), these are very redundant in order to
    facilitiate human readability. This subroutine uses only the DNR
    records.

    In order to read bridge information we use the 'private' STRIDE option
    -i which requires the 'secret' option -$ to work i.e. a command line like
    (note necessity to escape $ in shell):

    stride -h -\$ -i 1QLP.pdb

    Actually, a modified version of stride is required, that indicates
    whether bridges are parallel or antiparallel (amongst other
    things), creating the new record types FA1 and FA2 from some of
    the 'private' -i info. This modified version should be supplied
    with this module (OK for academic use only; see the notice in
    stride.doc in the stride directory).
    """

    if verbose:
        sys.stderr.write("running stride...")
    fd = os.popen("stride -\$ -i -h " + pdb_filename)
    ptsecstruct = parse_stride_output(fd, pdb_secstruct)
    fd.close()
    if verbose:
        sys.stderr.write("done\n")
        sys.stderr.write(str(ptsecstruct))
    return ptsecstruct
    
        
def parse_stride_output(filehandle, pdb_secstruct):
    """
    Build StrideStruct object data members using the STRIDE output
    read from the supplied filehandle.
    See more comments in read_secstruct_from_stride() above.

    Parameters:
      filehandle - filehandle to read from (already open for read)
       pdb_secstruct (read/write)
                       - a previously built PTSecStruct
                        (eg from PDB HELIX and SHEET cards).
                        If this is not None, only hydrogen bond information
                        is read, and added to this PTSecstruct.

    Return value:
      PTSecStruct object representing secondary sturcture and H bonds
      as assigned by STRIDE. 

    """
    if pdb_secstruct == None:
        pts = PTSecStruct()
        hbonds_only = False
    else:
        pts = pdb_secstruct
        hbonds_only = True

    # From the STRIDE documentation:
    #
    #      STRIDE produces output that is easily readable both
    #      visually and with computer programs. The side effect of
    #      this conveniency is larger file size of individual
    #      STRIDE entries. Every record is 79 symbols long and has
    #      the following general format:
    #
    #      Position        Description
    #
    #      1-3             Record code
    #      4-5             Not used
    #      6-73            Data
    #      74-75           Not used
    #      75-79           Four letter PDB code (if available)

    chainid_dict = {}

    for line in filehandle:
        rectype = line[0:3]
        if rectype == "HDR":
            # read pdb_header and pdb_id from HDR records
            pts.pdb_header = line[5:74]
            pts.pdb_id = line[74:79].lstrip()
        elif rectype == "LOC" and not hbonds_only:
            #
            # Build helix_list and strand_list from LOC record
            # types AlphaHelix/310Helix/PiHelix and Strand respectively.
            # The helix_list is a list of tuples
            #
            #  (start_chainid, start_resnum, end_chainid, end_resnum, type)
            #
            # and the strand_list is the same format.
            #
            # From the stride doc: [lines marked WRONG are incorrect though]
            #
            # LOC    Location of secondary structure elements
            #
            # Format:  6-17 Element name
            #         19-21 First residue name
            # WRONG   32-26 First residue PDB number
            # WRONG   28-28 First residue chain identifier
            #         36-38 Last residue name
            #         42-45 Last residue PDB number
            #         47-47 Last residue chain identifier
            #
            element_name = line[5:17].rstrip().upper()
            start_chainid = line[28]
            start_resnum = line[22:28].lstrip().rstrip()
            end_resnum = line[40:46].lstrip().rstrip()
            end_chainid = line[46]

            if not chainid_dict.has_key(start_chainid):
                chainid_dict[start_chainid] = True
            if element_name in ["ALPHAHELIX", "PIHELIX", "310HELIX"]:
                if element_name == "ALPHAHELIX":
                    helixtype = "H"
                elif element_name == "PIHELIX":
                    helixtype = "I"
                elif element_name == "310HELIX":
                    helixtype = "G"
                else:
                    helixtype = None # can't happen, TypeError later if does
                element_tuple = (start_chainid, start_resnum,
                                 end_chainid, end_resnum,
                                 helixtype)
                pts.helix_list.append(element_tuple)
            elif element_name == "STRAND":
                element_tuple = (start_chainid, start_resnum,
                                end_chainid, end_resnum)
                pts.strand_list.append(element_tuple)

        elif rectype == "DNR": # donor of H-bond; requires stride -h
            # From stride.doc:
            #
            #      DNR    Donor residue
            #
            #      Format:  6-8  Donor residue name
            #             10-10 Protein chain identifier
            #             12-15 PDB residue number
            #             17-20 Ordinal residue number
            #             26-28 Acceptor residue name
            #             30-30 Protein chain identifier
            #             32-35 PDB residue number
            #             37-40 Ordinal residue number
            #             42-45 N..0 distance
            #             47-52 N..O=C angle
            #             54-59 O..N-C angle
            #             61-66 Angle between the planes of donor
            #                   complex and O..N-C
            #             68-73 angle between the planes of acceptor
            #                   complex and N..O=C
            #
            # Build a tuple representing the Hydrogen bond:
            #
            #   (chainid1, resnum1, chainid2, resnum2, dist_N_O)
            #
            # and add to the list of H-bonds.
            #
            hbond_tuple = ( line[9], line[11:15].lstrip(),
                            line[29], line[31:35].lstrip(),
                            float(line[41:45]) ) # Angstroms
            pts.hbond_list.append(hbond_tuple)
        elif (rectype == "FA1" or rectype == "FA2") and not hbonds_only:
            # NB: requires modified stride and -\$ -i options
            # FA1 and FA2 show beta bridges, 4 hbonds for a bridge.
            # Also note residues might be off-by-one on the end as this
            # record is internal stride information output before the
            # adjustments made for assigning E to a residue (see stride
            # source code sheet.c and fillasn.c). We don't care much
            # here, we'll just allow a fudge factor of 1 on begin and
            # end of strand when checking if these bridges connect them.
            # Example FA1 records:
            #
            #   FA1 AntiPar From: - - 3 69 69 3
            #   FA1 Par From: - - 2 30 32 2
            #
            # (note '-' for blank chain id as per usual stride convention).
            # Build a tuple representing this (part of a) bridge:
            #
            #   (chainid1, resnum1, chainid2, resnum2, bdir)
            #
            # dir is P for parallel or N for antiparallel
            #
            fa1_rec = line.split()
            if fa1_rec[1] == "AntiPar":
                bdir = "N"
            elif fa1_rec[1] == "Par":
                bdir = "P"
            else:
                sys.stderr.write("readstride.py: ignored bad "
                                 + rectype + " record: "
                                 + line + '\n')

            bridgepart_tuple1 = (fa1_rec[3], fa1_rec[5],
                                 fa1_rec[4], fa1_rec[6], bdir)
            bridgepart_tuple2 = (fa1_rec[4], fa1_rec[7],
                                 fa1_rec[3], fa1_rec[8], bdir)

            pts.bridgeres_list.append(bridgepart_tuple1)
            pts.bridgeres_list.append(bridgepart_tuple2)
    return pts



def stride_chainid_to_pdb_chainid(stride_chainid):
    """
    Convert a STRIDE chainid to a PDB chainid.
    STRIDE uses '-' for a 'blank' chainid while PDB uses ' ' (space).
    So all this does is return the stride_chainid unless it is '-', then
    it returns ' '.

    Parameters:
        stride_chainid - the STRIDE chain identifier
    Return value:
        PDB chain identifier corresponding to supplied stride_chainid
    """
    if stride_chainid == '-':
        return ' '
    else:
        return stride_chainid
    
def pdb_chainid_to_stride_chainid(pdb_chainid):
    """
    Convert a PDB chainid to a STRIDE chainid.
    STRIDE uses '-' for a 'blank' chainid while PDB uses ' ' (space).
    So all this does is return the pdb_chainid unless it is ' ', then
    it returns '-'.

    We always use STRIDE style chain identifiers ('-' not ' ') internally.
    Note that the remediated (2007) PDB files now no longer use blank
    chainid anyway, they always name the chain 'A' etc.
    
    Parameters:
        pdb_chainid - the PDB chain identifier
    Return value:
        STRIDE chain identifier corresponding to supplied pdb_chainid
    """
    if pdb_chainid == ' ':
        return '-'
    else:
        return pdb_chainid
    

def read_secstruct_from_dssp(pdb_filename, pdb_secstruct = None):
    """
    Build and return an instance of the PTSecStruct class
    which represents the secondary structure as assigned
    by the DSSP program
    (Kabsch and Sander 1983 "Dictionary of Protein Secondary Structure:
     Pattern Recognition of Hydrogen-Bonded and Geometrical Features"
     Biopolymers 22:2577-2637).

    Parameters:
        pdb_filename - filename of PDB file to run DSSP against
        pdb_secstruct (read/write) default None
                       - a previously built PTSecStruct
                        (eg from PDB HELIX and SHEET cards).
                        If this is not None, only hydrogen bond information
                        is read, and added to this PTSecstruct.

    Return value:
        PTSecStruct class representing secondary structure and H bonds
        ass assigned by DSSP (see following comments).
        
    The class is instantiated by supplying a file handle of an open
    PDB file (for reading)
    to the constructor. DSSP is run with that PDB file as input and
    the secondary structure assignments are parsed
    from the DSSP output.

    Note the DSSP executable is actually 'dsspcmbi' not 'dssp'.
    """

    if verbose:
        sys.stderr.write("running dsspcmbi...")
    fd = os.popen("dsspcmbi " + pdb_filename)
    ptsecstruct = parse_dssp_output(fd, pdb_secstruct)
    fd.close()
    if verbose:
        sys.stderr.write("done\n")
        sys.stderr.write(str(ptsecstruct))
    return ptsecstruct
    

def parse_dssp_output(filehandle, pdb_secstruct):
    """
    Build StrideStruct object data members using the DSSP output
    read from the supplied filehandle.
    See more comments in read_secstruct_from_dssp() above.
    As well as the DSSP paper by Kabsch and Sander in Biopolymers cited
    there, see:

    http://swift.cmbi.ru.nl/gv/dssp/

    Note that a licence is needed for DSSP (free for academic use -
    see website above).

    Parameters:
      filehandle - filehandle to read from (already open for read)
      pdb_secstruct (read/write)
                       - a previously built PTSecStruct
                        (eg from PDB HELIX and SHEET cards).
                        If this is not None, only hydrogen bond information
                        is read, and added to this PTSecstruct.

    Return value:
      PTSecStruct object representing secondary sturcture and H bonds
      as assigned by DSSP. 

    """
    if pdb_secstruct == None:
        pts = PTSecStruct()
        hbonds_only = False
    else:
        pts = pdb_secstruct
        hbonds_only = True
        
    in_assignments = False
    in_secstruct = False
    prev_dssp_code = None
    dssp_bp_tuple_list = [] # list of tuples
                            # (dssp_resnum, bp1, bp1_label, bp2, bp2_label)
                            # from DSSP
    dssp_hbond_tuple_list = [] # list of tuples
                               # (dssp_resnum, hbond_resnum, energy) from DSSP
    dssp_pdb_resnum_dict = {} # dict of {dssp_resnum : (pdb_resnum, chainid)}
    for line in filehandle:
        if line[:6] == "HEADER":
            pts.pdb_header = line[10:80].rstrip()
            pts.pdb_id = line.rstrip().split()[-2]
        elif line[:25] == "  #  RESIDUE AA STRUCTURE": # header marking assigns
            in_assignments = True # secondary struct assignments start next line
            continue
        elif in_assignments:
            # go through and identify helices and strands and
            # and them to helix_list and strand_list
            # Then bridges and H-bonds
            dssp_resnum = line[:5].lstrip()
            if not dssp_resnum.isdigit():
                sys.stderr.write(
                        'WARNING: bad DSSP residue sequence number '
                        'in DSSP output, '
                        'skipped line:\n' + line)
                continue
            dssp_resnum = int(dssp_resnum)
            pdb_resnum = line[5:11].lstrip().rstrip()
            residue = line[13]
            if not pdb_resnum.isdigit():
                if not (len(pdb_resnum) > 0 and
                         (pdb_resnum[0] == '-' # can be negative, e.g. in 2CZR
                          or pdb_resnum[-1].isalpha())): #can have insertion code
                    if residue != '!': # '!' indicates chain break
                        sys.stderr.write('WARNING: bad PDB residue sequence  '
                                         'number in DSSP output, '
                                         'skipped line:\n' + line)
                    continue
            pdb_icode = line[10] # aleady included in pdb_resnum also
            pdb_chainid = pdb_chainid_to_stride_chainid(line[11])
            dssp_pdb_resnum_dict[dssp_resnum] = (pdb_resnum, pdb_chainid)
            # The DSSP code:
            #   H = alpha helix 
            #   B = residue in isolated beta-bridge 
            #   E = extended strand, participates in beta ladder 
            #   G = 3-helix (3/10 helix) 
            #   I = 5 helix (pi helix) 
            #   T = hydrogen bonded turn 
            dssp_code = line[16]
            if dssp_code != prev_dssp_code:
                # add secondary structure elements, whose boundaries we
                # detect in DSSP output by change of DSSP secondary structure
                # summary code.
                # TODO: deal with discrmiinating between continuous helices
                #       (as per identifySSE2 in TableauCreator)
                if not in_secstruct:
                    if dssp_code in ['H','B','E','G','I']:
#                        print 'xxx START',dssp_code, pdb_resnum
                        in_secstruct = True
                        start_resnum = pdb_resnum
                        start_chainid = pdb_chainid
                else:
#                    print 'xxx END',prev_dssp_code, pdb_resnum,dssp_code
                    in_secstruct = False
                    end_resnum = prev_pdb_resnum
                    end_chainid = prev_pdb_chainid
                    if prev_dssp_code in ['H','G','I']:
                        helixtype = prev_dssp_code
                        sse_tuple = (start_chainid, start_resnum,
                                     end_chainid, end_resnum, helixtype)
                    else:
                        sse_tuple = (start_chainid, start_resnum,
                                     end_chainid, end_resnum)
                    if prev_dssp_code == 'E': # ignore 'B' isolated bridge
                        if (not hbonds_only):
                            pts.strand_list.append(sse_tuple)
                    elif prev_dssp_code in ['H', 'G', 'I']:
                        if (not hbonds_only):
                            pts.helix_list.append(sse_tuple)

                    if dssp_code in ['H','B','E','G','I']:
#                        print 'yyy START',dssp_code, pdb_resnum
                        in_secstruct = True
                        start_resnum = pdb_resnum
                        start_chainid = pdb_chainid

                prev_dssp_code = dssp_code
            # END of dssp_code != prev_dssp_code

            if dssp_code == 'E': # only use bridge partners from E not B
                bp1_resnum = int(line[26:29].lstrip())
                bp2_resnum = int(line[30:33].lstrip())
                
                bp1_label = line[23]
                bp2_label = line[24]
                dssp_bp_tuple_list.append((dssp_resnum,
                                           bp1_resnum, bp1_label,
                                           bp2_resnum, bp2_label))

            # Make tuple for each of four different H bond fields from DSSP
            nho1_fields= line[40:51].lstrip().rstrip().split(',') #offset,energy
            if int(nho1_fields[0]) != 0:
                nho1_resnum = dssp_resnum + int(nho1_fields[0])
                nho1_energy = float(nho1_fields[1])
                dssp_hbond_tuple_list.append((dssp_resnum,nho1_resnum,nho1_energy))
            ohn1_fields= line[51:62].lstrip().rstrip().split(',') #offset,energy
            if int(ohn1_fields[0]) != 0:
                ohn1_resnum = dssp_resnum + int(ohn1_fields[0])
                ohn1_energy = float(ohn1_fields[1])
                dssp_hbond_tuple_list.append((dssp_resnum,ohn1_resnum,ohn1_energy))
            nho2_fields= line[62:73].lstrip().rstrip().split(',') #offset,energy
            if int(nho2_fields[0]) != 0:
                nho2_resnum = dssp_resnum + int(nho2_fields[0])
                nho2_energy = float(nho2_fields[1])
                dssp_hbond_tuple_list.append((dssp_resnum,nho2_resnum,nho2_energy))
            ohn2_fields= line[73:84].lstrip().rstrip().split(',') #offset,energy
            if int(ohn2_fields[0]) != 0:
                ohn2_resnum = dssp_resnum + int(ohn2_fields[0])
                ohn2_energy = float(ohn2_fields[1])
                dssp_hbond_tuple_list.append((dssp_resnum,ohn2_resnum,ohn2_energy))
#            print 'xxx',nho1_fields,ohn1_fields,nho2_fields,ohn2_fields
            prev_pdb_resnum = pdb_resnum
            prev_pdb_chainid = pdb_chainid
            prev_residue = residue
            prev_pdb_icode = pdb_icode
    # END of iteration of lines in filehanlde
    filehandle.close()

    if not hbonds_only:
        for (dssp_resnum, bp1_resnum, bp1_label, bp2_resnum, bp2_label) \
                in dssp_bp_tuple_list:
            # convert DSSP BP1 and BP2 fields to
            # bridgeres_list format, a list of:
            # (chainid1, resnum1, chainid2, resnum2, bdir)
            # This involves converting the DSSP sequential residue numbers to
            # PDB residue numbers which we use
            (pdb_resnum, pdb_chainid) = dssp_pdb_resnum_dict[dssp_resnum]
            if bp1_resnum != 0: # 0 indicates no beta brdige partner in this field
                try:
                    (bp1_pdb_resnum, bp1_pdb_chainid) = dssp_pdb_resnum_dict[bp1_resnum]
                except KeyError:
                    sys.stderr.write('WARNING: no PDB residue found for DSSP '
                                     'residue ' + str(bp1_resnum) +
                                     '; bridge skipped\n')
                    continue
                # The beta bridge labels from DSSP (columns 23,24) are uppercase
                # for antiparallel bridges and lowercase for parallel bridges
                # (as per Kabsch & Sander 1983, p. 2581).
                if bp1_label.islower():
                    bdir = 'P'
                else:
                    bdir = 'N'
                pts.bridgeres_list.append((pdb_chainid, pdb_resnum, \
                                           bp1_pdb_chainid, bp1_pdb_resnum, \
                                           bdir)) 
            if bp2_resnum != 0:
                try:
                    (bp2_pdb_resnum, bp2_pdb_chainid) = dssp_pdb_resnum_dict[bp2_resnum]
                except:
                    sys.stderr.write('WARNING: no PDB residue found for DSSP '
                                     'residue ' + str(bp2_resnum) +
                                     '; bridge skipped\n')
                    continue
                if bp2_label.islower():
                    bdir = 'P'
                else:
                    bdir = 'N'
                pts.bridgeres_list.append((pdb_chainid, pdb_resnum, \
                                          bp2_pdb_chainid, bp2_pdb_resnum,\
                                          bdir))

    for (dssp_resnum, hbond_dssp_resnum, hbond_energy) in dssp_hbond_tuple_list:
        # convert the DSSP hydrogen bond fields to hbond_list format:
        # (chainid1, resnum1, chainid2, resnum2, energy)
        # TODO note energy (kcal/mol) for DSSP not N..O distance (Angstroms)
        (pdb_resnum, pdb_chainid) = dssp_pdb_resnum_dict[dssp_resnum]
        (hbond_resnum, hbond_chainid) = dssp_pdb_resnum_dict[hbond_dssp_resnum]
        if hbond_energy <= DSSP_HBOND_ENERGY_THRESHOLD:
            pts.hbond_list.append((pdb_chainid, pdb_resnum,
                                   hbond_chainid, hbond_resnum, hbond_energy))

    return pts


def read_secstruct_from_pdb_file(pdb_filename):
    """
    Parameters:
        pdb_filename - filename of PDB file to read HELIX and SHEET cards from.

    Return value:
        PTSecStruct class representing secondary structure or
        None if there are no HELIX and SHEET cards in PDB file.

    See the PDB format specification (secondary structure section) at
        http://www.wwpdb.org/documentation/format23/sect5.html
    """

    # Note we are ignoring the sheet groupings here (though maybe we should
    # use them) and relying on 'registration' fields to define bridge
    # relationships between strands so we discover the correct sheets
    # by connected components the same as when using DSSP or STRIDE.
    # TODO: need more work on this, especially with tricky constructions
    # like in 1MTP where there are duplicated SHEET lines for same
    # strand in multiple sheets.
    
    pts = PTSecStruct()
    found_secstruct = False
    fh = open(pdb_filename)
    for line in fh:
        if line[:6] == "ATOM  ":
            break # secondary struct section must be before coordinate section
        if line[:6] == "HELIX ":
            found_secstruct = True
            initChainID = pdb_chainid_to_stride_chainid(line[19])
            initSeqNum = line[21:26].lstrip().rstrip() # include insertion code
            endChainID = pdb_chainid_to_stride_chainid(line[31])
            endSeqNum = line[33:38].lstrip().rstrip() #include insertion code
            helixClass = int(line[38:40])
            if helixClass == 1 or helixClass == 6:  # alpha
                helixtype = "H"
            elif helixClass == 3:  # pi
                helixtype = "I"
            elif helixClass == 5:  # 3_10
                helixtype = "G"
            else:
                helixtype = "H"
            element_tuple = (initChainID, initSeqNum,
                             endChainID, endSeqNum,
                             helixtype)
            pts.helix_list.append(element_tuple)
        elif line[:6] == "SHEET ":
            found_secstruct = True
            initChainID = pdb_chainid_to_stride_chainid(line[21])
            initSeqNum = line[22:27].lstrip().rstrip() #include insertion code
            endChainID = pdb_chainid_to_stride_chainid(line[32])
            endSeqNum = line[33:38].lstrip().rstrip() # include insertion code
            element_tuple = (initChainID, initSeqNum,
                             endChainID, endSeqNum)
            # can get duplicated strands in different sheets, e.g. 1MTP
            if element_tuple not in pts.strand_list:
                pts.strand_list.append(element_tuple)
            # "Registration" gives us bridge relationships between strands
            sense = int(line[38:40])
            if sense == 0:
                first_strand = True # no registration information for 1st strand
            elif sense == 1: # parallel
                first_strand = False
                bdir = 'P'
            elif sense == -1: # anti-parallel
                first_strand = False
                bdir = 'N'
            else:
                sys.stderr.write('WARNING: bad sense (columns 39-40) in line:\n'
                                 + line + '\n')
                found_secstruct = False
                break
            if not first_strand:
                curChainId = pdb_chainid_to_stride_chainid(line[49])
                curResSeq = line[50:55].lstrip().rstrip() # include icode
                prevChainId = pdb_chainid_to_stride_chainid(line[64])
                prevResSeq = line[65:70].lstrip().rstrip() # include icode
                if curResSeq == '' or prevResSeq == '':
                    sys.stderr.write('WARNING: bad registration info in line:\n'
                                     + line + '\n')
                    found_secstruct = False
                    break
                bridge_tuple = (curChainId, curResSeq, prevChainId, prevResSeq,
                                bdir)
                pts.bridgeres_list.append(bridge_tuple)
            
    fh.close()

    if not pts.check_validity_and_fix():
        return None
    
    # sort the strand and helix lists (by chainid,startresnum ascending)
    # so that the numbering when we build graph is by sequence in chain
    # TOOD: this is to match what we do with DSSP and STRIDE, but maybe
    # there should be an option to use strand numbering from PDB file
    # (typically strands numbered left to right in sheet).
    pts.strand_list.sort(cmp=tuplecmp)
    pts.helix_list.sort(cmp=tuplecmp)

#    print 'zzzzz',pts.helix_list

    if found_secstruct:
        return pts
    else:
        return None

def ptsecstruct_set_verbose(verb):
    """
    set the module global verbose flag in this module to supplied value
    Parameters: verb - True (for verbose output) or False
    Return value: None
    Uses globals: verbose (in this module)
    """
    global verbose
    verbose = verb
    

def tuplecmp(tup1, tup2):
    """
    Comparison function for (chain, pdb_seqres_strt, pdb_seqres_end, type)
    tuples used by sort in PTSecStruct.check_validity() and
    PTSecStruct.__str__() functions.
    """
    if tup1[0] < tup2[0]:
        return -1
    elif tup1[0] > tup2[0]:
        return 1
    else:
        return pdb_res_seq_cmp(tup1[1], tup2[1])
    

def write_pml_define_sses(fh, domid, sse_list):
    """
    Write PyMOL commands to define the SSEs according to our (DSSP or STRIDE
    or other) definitions rather than the PDB or PyMOL dss definitions.

    Parameters:
       fh - open (write) filehandle to write PyMOL commands to
       domid - structure identifier to define SSEs in
       sse_list - list of  (chain,start,end,type) tuples specifyign SSEs in struct
    Return value: None
    """
    fh.write("alter /"+domid + ", ss='L'\n") # make entire struct loop region
    fh.write("rebuild\n")
    for (chain, start_resi, end_resi, ssetype) in sse_list:
        if ssetype == 'E':
            pml_ssetype = 'S'
        elif ssetype == 'H':
            pml_ssetype = 'H'
        else:
            raise ValueError('bad sse type ' + ssetype + '\n')
        fh.write('alter  ' + '/' + domid + '//' + chain +
                 '/' + str(start_resi) + '-' + str(end_resi) +
                 ", ss = '" + pml_ssetype + "'\n")
    fh.write("rebuild\n")


def write_pymol_load(fh, domid, pdbfile):
    """
    Write comands to load specified sturcture into PyMOL

    Parameters:
       fh - open (write) filehandle to write to
       domid - identifier of sructure
       pdbfile - PDB file to get SSEs for
       
    Return value: None
    """
    fh.write('load ' + pdbfile + '\n')
#    fh.write('color ' + color + ', /'+domid + '\n')




def write_pymol_prelude(fh):
    """
    Write startup information in PyMOL script (.pml) file.

    Parameters:
       fh - open (write) filehandle to write to
    Return value: None
    """
    fh.write('# generated by ' + " ".join(sys.argv) +'\n')
    timestamp = strftime("%d%b%Y %H:%M:%S", localtime())
    fh.write('# on ' + timestamp + '\n')
    fh.write('#\n')    



def write_pymol_conclusion(fh):
    """
    Write finalizing information in PyMOL script (.pml) file.

    Parameters:
       fh - open (write) filehandle to write to
    Return value: None
    """
    fh.write('hide everything\n')
    fh.write('show cartoon\n')


def read_secstruct_from_pmml(pdb_filename):
    """
    Build and return an instance of the PTSecStruct class
    which represents the secondary structure as assigned
    by the PML program
    
    Konagurthu, Allison, Stuckey, Lesk 2011 "Piecewise linear approximation
    of protein structures using the principle of minimum message length"
    Bioinformatics 27(13):i43-i51 (ISMB 2011, Vienna)   

    Available from http://www.csse.monash.edu.au/~karun/pmml 
    (executables only, no source).

    NB this only provides SSE assignment, not H bond or bridge information
    so cannot be used for sheet idneitifcation etc. for Pro-Origami,
    only impelemented to be used in pytableaucreate.py to make
    tableaux from PMML rather than DSSP or STRIDE.
    (If it was really needed for some reason we could maybe run DSSP
    afterwards to get this info, as is done when parse SSEs from PDB
    files then running DSSP or STRIDE for hbonds).

    Also, having been added as an afterthought just to try out tableau
    searching with PMML tableaux (should also try PALSSE some time)
    this is not thoroughly tested (unlike DSSP and STRIDE) and may
    only work on ASTRAL type pdb files (cleaned up, no funny insertion
    code tricks, empty chain ids, etc.) [In fact, it doesn't even work with
    all ASTRAL files e.g. d1ayea2 which has an insertion code - pmml works
    but leaves out insertion codes entirely so cannot match up with Bio.PDB
    resid stuff in our code, causes all sorts of problems]

    Parameters:
        pdb_filename - filename of PDB file to run STRIDE against

    Return value:
        PTSecStruct class representing secondary structure and H bonds
        as assigned by PMML.
        
    The class is instantiated by supplying a file handle of an open
    PDB file (for reading)
    to the constructor. PMML is run with that PDB file as input and
    the secondary structure assignments are parsed
    from the PMML output.
    """

    if verbose:
        sys.stderr.write("running pmml...")
    fd = os.popen("pmml " + pdb_filename)
    ptsecstruct = parse_pmml_output(fd)
    fd.close()
    if verbose:
        sys.stderr.write("done\n")
        sys.stderr.write(str(ptsecstruct))
    return ptsecstruct
    
        
def parse_pmml_output(filehandle):
    """
    Build StrideStruct object data members using the STRIDE output
    read from the supplied filehandle.
    See more comments in read_secstruct_from_stride() above.

    Parameters:
      filehandle - filehandle to read from (already open for read)

    Return value:
      PTSecStruct object representing secondary sturcture and H bonds
      as assigned by PMML. 

    """
    pts = PTSecStruct()

    # PMML output (only the residue level SSE assignment part, skip rest):
    #
    #
    #..........................................................................
    #..........RESIDUE LEVEL SECONDARY STRUCTURE ASSIGNMENT DETAILS............
    #!    Chn |-- residue --| SSType
    #    1 A     1 M   (MET1)  
    #    2 A     2 Q   (GLN2) E
    #    3 A     3 I   (ILE3) E
    #    4 A     4 F   (PHE4) E
    #    5 A     5 V   (VAL5) E
    #    6 A     6 K   (LYS6) E
    #    7 A     7 T   (THR7) E
    #    8 A     8 L   (LEU8)  
    #    9 A     9 T   (THR9)  
    #   10 A    10 G  (GLY10) E
    #   11 A    11 K  (LYS11) E
    #   12 A    12 T  (THR12) E
    #
    # Note we must use column positions not whitespace split as they are
    # in fixed positions and can end up with no space between fields when
    # residue numbers are large enough e.g. d2e2dc1
    #
    # Position     Decription
    # 1 - 5        PMML sequential residue number (?)
    #     7        Chain
    # 10-13        Residue number
    #    15        Residue one-letter code
    # 16-24        Residue 3-letter code + number, in parentheses
    #    26        SSE type (DSSP code)
    #

    at_residue_assignment = False
    in_secstruct = False
    prev_dssp_code = None
    for line in filehandle:
        if "RESIDUE LEVEL SECONDARY STRUCTURE ASSIGNMENT" in line:
            at_residue_assignment = True
        elif line[0] == '!':
            continue
        elif at_residue_assignment:
            #
            # Build helix_list and strand_list from PMMLresidue SSE assignments
            # types AlphaHelix/310Helix/PiHelix and Strand respectively.
            # The helix_list is a list of tuples
            #
            #  (start_chainid, start_resnum, end_chainid, end_resnum, type)
            #
            # and the strand_list is the same format.
            #

            pmml_resnum = int(line[:5])
            pdb_chainid = line[6]
            pdb_resnum = line[9:13].lstrip().rstrip()
            residue_oneletter = line[14]
            residue_string = line[15:24].lstrip().rstrip()
            dssp_code = line[25]
            # Uses (some of) the DSSP code:
            #   H = alpha helix 
            #   E = extended strand, participates in beta ladder 
            #   G = 3-helix (3/10 helix) 
            #   I = 5 helix (pi helix) 
            if dssp_code != prev_dssp_code:
                # add secondary structure elements, whose boundaries we
                # detect in output by change of secondary structure
                # summary code.
                # TODO: deal with discrmiinating between continuous helices
                #       (as per identifySSE2 in TableauCreator)
                if not in_secstruct:
                    if dssp_code in ['H','E','G','I']:
#                        print 'xxx START',dssp_code, pdb_resnum
                        in_secstruct = True
                        start_resnum = pdb_resnum
                        start_chainid = pdb_chainid
                else:
#                    print 'xxx END',prev_dssp_code, pdb_resnum,dssp_code
                    in_secstruct = False
                    end_resnum = prev_pdb_resnum
                    end_chainid = prev_pdb_chainid
                    if prev_dssp_code in ['H','G','I']:
                        helixtype = prev_dssp_code
                        sse_tuple = (start_chainid, start_resnum,
                                     end_chainid, end_resnum, helixtype)
                    else:
                        sse_tuple = (start_chainid, start_resnum,
                                     end_chainid, end_resnum)
                    if prev_dssp_code == 'E':
                        pts.strand_list.append(sse_tuple)
                    elif prev_dssp_code in ['H', 'G', 'I']:
                        pts.helix_list.append(sse_tuple)

                    if dssp_code in ['H','E','G','I']:
#                        print 'yyy START',dssp_code, pdb_resnum
                        in_secstruct = True
                        start_resnum = pdb_resnum
                        start_chainid = pdb_chainid

                prev_dssp_code = dssp_code
            # END of dssp_code != prev_dssp_code

            prev_pdb_resnum = pdb_resnum
            prev_pdb_chainid = pdb_chainid

    return pts


